{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.core.interactiveshell import InteractiveShell\n",
    "InteractiveShell.ast_node_interactivity = 'all'  # default is 'last_expr'\n",
    "\n",
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "\n",
    "# Local repo checkouts that must be importable from this notebook; adjust for your machine\n",
    "repo_paths = [\n",
    "    '/Users/siyuyang/Source/repos/GitHub_MSFT/CameraTraps_data',  # this repo\n",
    "    '/Users/siyuyang/Source/repos/GitHub_MSFT/ai4eutils',\n",
    "]\n",
    "\n",
    "for repo_path in repo_paths:\n",
    "    # guard so that re-running this cell does not pile up duplicate sys.path entries\n",
    "    if repo_path not in sys.path:\n",
    "        sys.path.append(repo_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import math\n",
    "import os\n",
    "from collections import Counter, defaultdict\n",
    "from random import Random, sample, shuffle\n",
    "\n",
    "from tqdm import tqdm\n",
    "from unidecode import unidecode\n",
    "\n",
    "from data_management.megadb.schema import sequences_schema_check\n",
    "from data_management.annotations.add_bounding_boxes_to_megadb import *\n",
    "from data_management.megadb.converters.cct_to_megadb import make_cct_embedded, process_sequences, write_json\n",
    "\n",
    "import sas_blob_utils"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Importing Season 1 of the Snapshot Safari projects\n",
    "\n",
    "This notebook imports the first season of the 6 Snapshot Safari datasets into the MegaDB format. \n",
    "\n",
    "Each Snapshot Safari project has its own MegaDB dataset with public images, so that the species can be mapped separately in case they are named differently. Future seasons can be added to these datasets.\n",
    "\n",
    "All private images, however, are lumped into a single dataset, `snapshot_safari_private`, because they contain only a few species."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### List all images in the private container as they need to be a separate dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# SAS key to where the private images are stored\n",
    "container_sas = ''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      "0it [00:00, ?it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "listing blobs...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2616it [00:02, 1073.69it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Enumerated 2616 matching blobs out of 2616 total\n",
      "CPU times: user 871 ms, sys: 46.3 ms, total: 917 ms\n",
      "Wall time: 2.45 s\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "list_private_images = sas_blob_utils.list_blobs_in_container(\n",
    "    container_uri=container_sas\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'ENO_S1_private/ENO_S1/D04/D04_R1/ENO_S1_D04_R1_IMAG0030.JPG'"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "list_private_images[1000]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def strip_private_prefix(blob_path):\n",
    "    \"\"\"Drop a leading '{collection}_private/' folder from a blob path.\n",
    "\n",
    "    Paths that do not start with such a folder are returned unchanged, which makes\n",
    "    this safe to apply more than once (e.g. when the cell is re-run).\n",
    "    \"\"\"\n",
    "    top_dir, sep, rest = blob_path.partition('/')\n",
    "    return rest if sep and top_dir.endswith('_private') else blob_path\n",
    "\n",
    "\n",
    "# After stripping, the paths are later matched against the `file_name` values of the CCT entries\n",
    "list_private_images = [strip_private_prefix(blob_path) for blob_path in list_private_images]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'ENO_S1/D04/D04_R1/ENO_S1_D04_R1_IMAG0030.JPG'"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "list_private_images[1000]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "list_private_images = set(list_private_images)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Converting to MegaDB format"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "name_to_cct = {\n",
    "    'snapshot_karoo': 'SnapshotKaroo_S1_v1.0.json',\n",
    "    'snapshot_enonkishu': 'SnapshotEnonkishu_S1_v1.0.json',\n",
    "    'snapshot_camdeboo': 'SnapshotCamdeboo_S1_v1.0.json',\n",
    "    'snapshot_mountain_zebra': 'SnapshotMountainZebra_S1_v1.0.json',\n",
    "    'snapshot_kruger': 'SnapshotKruger_S1_v1.0.json',\n",
    "    'snapshot_kgalagadi': 'SnapshotKgalagai_S1_v1.0.json'\n",
    "}\n",
    "\n",
    "private_set_name = 'snapshot_safari_private'\n",
    "\n",
    "cct_dir = '.../data/CameraTraps/CCT_JSONs'\n",
    "output_dir = '.../AI4Earth/CameraTrap/Databases/megadb_2020/to_ingest'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading image DB...\n",
      "Number of items from the image DB: 38293\n",
      "Number of images with more than 1 species: 27 (0.07% of image DB)\n",
      "No bbox DB provided.\n",
      "The dataset_name is set to snapshot_karoo. Please make sure this is correct!\n",
      "Making a deep copy of docs...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 38074/38074 [00:00<00:00, 281090.91it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Putting 38074 images into sequences...\n",
      "Number of sequences: 14806\n",
      "Checking the location field...\n",
      "Checking which fields in a CCT image entry are sequence-level...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "all_img_properties\n",
      "{'resting', 'corrupt', 'location', 'id', 'standing', 'file', 'moving', 'frame_num', 'young_present', 'count', 'datetime', 'season', 'subject_id', 'interacting', 'class'}\n",
      "\n",
      "img_level_properties\n",
      "{'file', 'frame_num', 'id'}\n",
      "\n",
      "image-level properties that really should be sequence-level\n",
      "{'resting', 'corrupt', 'standing', 'location', 'moving', 'young_present', 'count', 'datetime', 'season', 'subject_id', 'interacting', 'class'}\n",
      "\n",
      "! Sequence-level property corrupt with value False should be a dataset-level property. Removed from sequences.\n",
      "Finished processing sequences.\n",
      "Example sequence items:\n",
      "\n",
      "OrderedDict([('dataset', 'snapshot_karoo'), ('seq_id', 'KAR_S1#A01#1#1'), ('location', 'A01'), ('images', [{'id': 'KAR_S1/A01/A01_R1/KAR_S1_A01_R1_IMAG00008', 'frame_num': 1, 'file': 'KAR_S1/A01/A01_R1/KAR_S1_A01_R1_IMAG00008.JPG'}]), ('class', ['empty']), ('datetime', '2017-10-04 01:01:44'), ('resting', None), ('standing', None), ('moving', None), ('young_present', None), ('count', None), ('season', 'KAR_S1'), ('subject_id', 28794480), ('interacting', None)])\n",
      "\n",
      "[OrderedDict([('dataset', 'snapshot_karoo'), ('seq_id', 'KAR_S1#C04#1#1803'), ('location', 'C04'), ('images', [{'id': 'KAR_S1/C04/C04_R1/KAR_S1_C04_R1_IMAG4199', 'frame_num': 1, 'file': 'KAR_S1/C04/C04_R1/KAR_S1_C04_R1_IMAG4199.JPG'}, {'id': 'KAR_S1/C04/C04_R1/KAR_S1_C04_R1_IMAG4200', 'frame_num': 2, 'file': 'KAR_S1/C04/C04_R1/KAR_S1_C04_R1_IMAG4200.JPG'}, {'id': 'KAR_S1/C04/C04_R1/KAR_S1_C04_R1_IMAG4201', 'frame_num': 3, 'file': 'KAR_S1/C04/C04_R1/KAR_S1_C04_R1_IMAG4201.JPG'}]), ('class', ['empty']), ('datetime', '2017-10-08 08:23:06'), ('resting', None), ('standing', None), ('moving', None), ('young_present', None), ('count', None), ('season', 'KAR_S1'), ('subject_id', 28813119), ('interacting', None)])]\n",
      "\n",
      "===========================================\n",
      "Loading image DB...\n",
      "Number of items from the image DB: 29414\n",
      "Number of images with more than 1 species: 1070 (3.64% of image DB)\n",
      "No bbox DB provided.\n",
      "The dataset_name is set to snapshot_enonkishu. Please make sure this is correct!\n",
      "Making a deep copy of docs...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 28544/28544 [00:00<00:00, 289525.37it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Putting 28544 images into sequences...\n",
      "Number of sequences: 12969\n",
      "Checking the location field...\n",
      "Checking which fields in a CCT image entry are sequence-level...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "all_img_properties\n",
      "{'resting', 'corrupt', 'location', 'id', 'standing', 'file', 'moving', 'frame_num', 'young_present', 'count', 'datetime', 'season', 'subject_id', 'interacting', 'class'}\n",
      "\n",
      "img_level_properties\n",
      "{'file', 'frame_num', 'id'}\n",
      "\n",
      "image-level properties that really should be sequence-level\n",
      "{'resting', 'corrupt', 'standing', 'location', 'moving', 'young_present', 'count', 'datetime', 'season', 'subject_id', 'interacting', 'class'}\n",
      "\n",
      "! Sequence-level property corrupt with value False should be a dataset-level property. Removed from sequences.\n",
      "Finished processing sequences.\n",
      "Example sequence items:\n",
      "\n",
      "OrderedDict([('dataset', 'snapshot_enonkishu'), ('seq_id', 'ENO_S1#B02#1#7'), ('location', 'B02'), ('images', [{'id': 'ENO_S1/B02/B02_R1/ENO_S1_B02_R1_IMAG0017', 'frame_num': 1, 'file': 'ENO_S1/B02/B02_R1/ENO_S1_B02_R1_IMAG0017.JPG'}, {'id': 'ENO_S1/B02/B02_R1/ENO_S1_B02_R1_IMAG0018', 'frame_num': 2, 'file': 'ENO_S1/B02/B02_R1/ENO_S1_B02_R1_IMAG0018.JPG'}, {'id': 'ENO_S1/B02/B02_R1/ENO_S1_B02_R1_IMAG0019', 'frame_num': 3, 'file': 'ENO_S1/B02/B02_R1/ENO_S1_B02_R1_IMAG0019.JPG'}]), ('class', ['birdother', 'warthog', 'gazellegrants']), ('datetime', '2018-11-03 12:02:37'), ('resting', 0.0), ('standing', 0.1), ('moving', 1.0), ('young_present', 0.8), ('count', '3'), ('season', 'ENO_S1'), ('subject_id', 31878055), ('interacting', 0.0)])\n",
      "\n",
      "[OrderedDict([('dataset', 'snapshot_enonkishu'), ('seq_id', 'ENO_S1#C05#2#56'), ('location', 'C05'), ('images', [{'id': 'ENO_S1/C05/C05_R2/ENO_S1_C05_R2_IMAG0100', 'frame_num': 1, 'file': 'ENO_S1/C05/C05_R2/ENO_S1_C05_R2_IMAG0100.JPG'}, {'id': 'ENO_S1/C05/C05_R2/ENO_S1_C05_R2_IMAG0101', 'frame_num': 2, 'file': 'ENO_S1/C05/C05_R2/ENO_S1_C05_R2_IMAG0101.JPG'}, {'id': 'ENO_S1/C05/C05_R2/ENO_S1_C05_R2_IMAG0102', 'frame_num': 3, 'file': 'ENO_S1/C05/C05_R2/ENO_S1_C05_R2_IMAG0102.JPG'}]), ('class', ['gazellethomsons']), ('datetime', '2018-10-29 08:07:11'), ('resting', 0.0), ('standing', 0.67), ('moving', 0.89), ('young_present', 0.11), ('count', '4'), ('season', 'ENO_S1'), ('subject_id', 31955698), ('interacting', 0.0)])]\n",
      "\n",
      "===========================================\n",
      "Loading image DB...\n",
      "Number of items from the image DB: 30551\n",
      "Number of images with more than 1 species: 166 (0.54% of image DB)\n",
      "No bbox DB provided.\n",
      "The dataset_name is set to snapshot_camdeboo. Please make sure this is correct!\n",
      "Making a deep copy of docs...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 30227/30227 [00:00<00:00, 280110.79it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Putting 30227 images into sequences...\n",
      "Number of sequences: 12024\n",
      "Checking the location field...\n",
      "Checking which fields in a CCT image entry are sequence-level...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "all_img_properties\n",
      "{'resting', 'corrupt', 'location', 'id', 'standing', 'file', 'moving', 'frame_num', 'young_present', 'count', 'datetime', 'season', 'subject_id', 'interacting', 'class'}\n",
      "\n",
      "img_level_properties\n",
      "{'file', 'frame_num', 'id'}\n",
      "\n",
      "image-level properties that really should be sequence-level\n",
      "{'resting', 'corrupt', 'standing', 'location', 'moving', 'young_present', 'count', 'datetime', 'season', 'subject_id', 'interacting', 'class'}\n",
      "\n",
      "! Sequence-level property corrupt with value False should be a dataset-level property. Removed from sequences.\n",
      "! Sequence-level property datetime with value  should be a dataset-level property. Removed from sequences.\n",
      "Finished processing sequences.\n",
      "Example sequence items:\n",
      "\n",
      "OrderedDict([('dataset', 'snapshot_camdeboo'), ('seq_id', 'CDB_S1#A05#1#3'), ('location', 'A05'), ('images', [{'id': 'CDB_S1/A05/A05_R1/CDB_S1_A05_R1_IMAG0007', 'frame_num': 1, 'file': 'CDB_S1/A05/A05_R1/CDB_S1_A05_R1_IMAG0007.JPG'}]), ('class', ['empty']), ('resting', None), ('standing', None), ('moving', None), ('young_present', None), ('count', None), ('season', 'CDB_S1'), ('subject_id', 32961811), ('interacting', None)])\n",
      "\n",
      "[OrderedDict([('dataset', 'snapshot_camdeboo'), ('seq_id', 'CDB_S1#E05#2#68'), ('location', 'E05'), ('images', [{'id': 'CDB_S1/E05/E05_R2/CDB_S1_E05_R2_IMAG0152', 'frame_num': 1, 'file': 'CDB_S1/E05/E05_R2/CDB_S1_E05_R2_IMAG0152.JPG'}, {'id': 'CDB_S1/E05/E05_R2/CDB_S1_E05_R2_IMAG0153', 'frame_num': 2, 'file': 'CDB_S1/E05/E05_R2/CDB_S1_E05_R2_IMAG0153.JPG'}, {'id': 'CDB_S1/E05/E05_R2/CDB_S1_E05_R2_IMAG0154', 'frame_num': 3, 'file': 'CDB_S1/E05/E05_R2/CDB_S1_E05_R2_IMAG0154.JPG'}]), ('class', ['monkeyvervet']), ('resting', 0.0), ('standing', 0.0), ('moving', 1.0), ('young_present', 0.0), ('count', '1'), ('season', 'CDB_S1'), ('subject_id', 33000590), ('interacting', 0.0)])]\n",
      "\n",
      "===========================================\n",
      "Loading image DB...\n",
      "Number of items from the image DB: 73570\n",
      "Number of images with more than 1 species: 36 (0.05% of image DB)\n",
      "No bbox DB provided.\n",
      "The dataset_name is set to snapshot_mountain_zebra. Please make sure this is correct!\n",
      "Making a deep copy of docs...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 76%|███████▋  | 55799/73034 [00:00<00:00, 278957.69it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Putting 73034 images into sequences...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 73034/73034 [00:00<00:00, 271561.50it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of sequences: 71178\n",
      "Checking the location field...\n",
      "Checking which fields in a CCT image entry are sequence-level...\n"
     ]
    }
   ],
   "source": [
    "name_to_sequences = {}\n",
    "private_embedded = []  # private entries pooled across all projects\n",
    "\n",
    "for dataset_name, cct_fn in name_to_cct.items():\n",
    "    all_entries = make_cct_embedded(image_db=os.path.join(cct_dir, cct_fn))\n",
    "\n",
    "    # Partition the entries: private ones go to the shared pool, the rest stay with this dataset\n",
    "    public_entries = []\n",
    "    for entry in all_entries:\n",
    "        is_private = entry['file_name'] in list_private_images\n",
    "        destination = private_embedded if is_private else public_entries\n",
    "        destination.append(entry)\n",
    "\n",
    "    name_to_sequences[dataset_name] = process_sequences(public_entries, dataset_name)\n",
    "    print('===========================================')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(private_embedded)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# pass the schema check\n",
    "for dataset_name, sequences in name_to_sequences.items():\n",
    "    print(f'Dataset {dataset_name}')\n",
    "    sequences_schema_check.sequences_schema_check(sequences)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# private set\n",
    "\n",
    "private_sequences = process_sequences(private_embedded, private_set_name)\n",
    "sequences_schema_check.sequences_schema_check(private_sequences)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "name_to_sequences[private_set_name] = private_sequences"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(name_to_sequences)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for dataset_name, sequences in name_to_sequences.items():\n",
    "    output_path = os.path.join(output_dir, f'{dataset_name}_megadb.json')\n",
    "    \n",
    "    with open(output_path, 'w') as f:\n",
    "        json.dump(sequences, f, indent=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
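    "### Fix-up: prepend the collection prefix to file paths in the private dataset\n",
    "\n",
    "The `{collection}_private/` folder prefix was not prepended to the image paths in the first export of the private dataset. Add it here so that the collections share a common path root and can be lumped into one dataset.\n",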
    "\n",
    "Also prepend the dataset indicator to the location field so the location values are unique in the `snapshot_safari_private` dataset."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('.../CameraTrap/Databases/megadb_2020/to_ingest/OLDsnapshot_safari_private_megadb.json') as f:\n",
    "    private_sequences = json.load(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1257"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(private_sequences)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "collection_names = set()\n",
    "\n",
    "for seq in private_sequences:\n",
    "    # seq_id looks like 'KAR_S1#A01#1#1'; the part before the first '#' names the collection\n",
    "    collection_name = seq['seq_id'].split('#')[0]\n",
    "    collection_names.add(collection_name)\n",
    "\n",
    "    # The startswith guards make this cell safe to re-run: values that already carry\n",
    "    # the prefix are left alone instead of being prefixed a second time.\n",
    "    location = str(seq['location'])\n",
    "    location_prefix = f'{collection_name}_'\n",
    "    if not location.startswith(location_prefix):\n",
    "        seq['location'] = f'{location_prefix}{location}'\n",
    "\n",
    "    file_prefix = f'{collection_name}_private/'\n",
    "    for im in seq['images']:\n",
    "        if not im['file'].startswith(file_prefix):\n",
    "            im['file'] = f'{file_prefix}{im[\"file\"]}'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'CDB_S1', 'ENO_S1', 'KAR_S1', 'KGA_S1', 'KRU_S1', 'MTZ_S1'}"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "collection_names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('.../megadb_2020/to_ingest/snapshot_safari_private_megadb.json', 'w') as f:\n",
    "    json.dump(private_sequences, f, indent=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Fixing location values in `snapshot_safari_private`\n",
    "\n",
    "Remove the season number from each location value, since locations are the same across seasons."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "from azure.cosmos.cosmos_client import CosmosClient"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Initialize Cosmos DB client\n",
    "url = os.environ['COSMOS_ENDPOINT']\n",
    "key = os.environ['COSMOS_WRITE_KEY']\n",
    "client = CosmosClient(url, credential=key)\n",
    "\n",
    "database = client.get_database_client('camera-trap')\n",
    "container_sequences = database.get_container_client('sequences')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Length of results: 1257\n",
      "CPU times: user 147 ms, sys: 21.2 ms, total: 168 ms\n",
      "Wall time: 2.54 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "query = '''\n",
    "SELECT *\n",
    "FROM sequences seq\n",
    "'''\n",
    "\n",
    "partition_key = 'snapshot_safari_private'  # use None if querying across all partitions\n",
    "\n",
    "# Restrict the query to one partition when a key is given; otherwise query across all of them\n",
    "if partition_key is None:\n",
    "    query_scope = {'enable_cross_partition_query': True}\n",
    "else:\n",
    "    query_scope = {'partition_key': partition_key}\n",
    "\n",
    "result_iterable = container_sequences.query_items(query=query, **query_scope)\n",
    "\n",
    "# Materialise the iterable so the items can be counted and edited in later cells\n",
    "results = list(result_iterable)\n",
    "\n",
    "print('Length of results:', len(results))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in results:\n",
    "    assert 'location' in i"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "new_locations = set()\n",
    "\n",
    "for i in results:\n",
    "    # Stored locations look like '{project code}_{season}_{original location}', e.g. 'ENO_S1_D04'.\n",
    "    # Splitting at most twice keeps an original location that itself contains '_' intact.\n",
    "    old_loc = i['location']\n",
    "    parts = old_loc.split('_', 2)\n",
    "    # Fail with a clear message (rather than an IndexError) if the value does not have\n",
    "    # three parts, e.g. because this cell was already run on these items.\n",
    "    assert len(parts) == 3, f'unexpected location format (already fixed?): {old_loc}'\n",
    "    project_code, _season, original_loc = parts\n",
    "\n",
    "    new_loc = f'{project_code}_{original_loc}'\n",
    "    i['location'] = new_loc\n",
    "    new_locations.add(new_loc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "124"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(new_locations)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "new_locations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "results[100]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cosmos DB system-generated properties; removed so they are not written out with the documents\n",
    "cosmos_system_props = ('_rid', '_self', '_etag', '_attachments', '_ts')\n",
    "\n",
    "for item in results:\n",
    "    for prop in cosmos_system_props:\n",
    "        # pop with a default is a no-op when the property is absent\n",
    "        item.pop(prop, None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "results[100]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "write_json('.../CameraTrap/Databases/megadb_2020/snapshot_safari_private_megadb.json', results)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now we can use the bulk ingestion tool and choose the \"Update\" option to fix these sequence entries (could have updated via the Python SDK too since there are so few entries)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Length of results: 20\n"
     ]
    }
   ],
   "source": [
    "# verify they are updated\n",
    "\n",
    "query = '''\n",
    "SELECT TOP 20 *\n",
    "FROM sequences seq\n",
    "'''\n",
    "\n",
    "partition_key = 'snapshot_safari_private'  # use None if querying across all partitions\n",
    "\n",
    "# Restrict the query to one partition when a key is given; otherwise query across all of them\n",
    "if partition_key is None:\n",
    "    query_scope = {'enable_cross_partition_query': True}\n",
    "else:\n",
    "    query_scope = {'partition_key': partition_key}\n",
    "\n",
    "result_iterable = container_sequences.query_items(query=query, **query_scope)\n",
    "\n",
    "results = list(result_iterable)\n",
    "\n",
    "print('Length of results:', len(results))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "results"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Specify the splits of the Snapshot Safari dataset\n",
    "\n",
    "We do this so that a given location lands in the same split in both the public (animal) sets and the private set.\n",
    "\n",
    "Locations are split 75% : 12.5% : 12.5% into train : val : test."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "new_locations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset_name_to_code = {\n",
    "    'snapshot_karoo': 'KAR',\n",
    "    'snapshot_enonkishu': 'ENO',\n",
    "    'snapshot_camdeboo': 'CDB',\n",
    "    'snapshot_mountain_zebra': 'MTZ',\n",
    "    'snapshot_kruger': 'KRU',\n",
    "    'snapshot_kgalagadi': 'KGA'\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {},
   "outputs": [],
   "source": [
    "public_sets_dir = '.../CameraTrap/Databases/megadb_2020/snapshot_safari/public'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset_to_locations = defaultdict(set)\n",
    "\n",
    "# sorted() makes the traversal order deterministic; the extension filter skips stray\n",
    "# non-JSON entries (e.g. .DS_Store) that would otherwise make json.load fail\n",
    "for fn in sorted(os.listdir(public_sets_dir)):\n",
    "    if not fn.endswith('.json'):\n",
    "        continue\n",
    "    with open(os.path.join(public_sets_dir, fn)) as f:\n",
    "        sequences = json.load(f)\n",
    "    # collect the distinct location values seen for each dataset\n",
    "    for seq in sequences:\n",
    "        dataset_to_locations[seq['dataset']].add(seq['location'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "snapshot_camdeboo has 20 locations\n",
      "snapshot_mountain_zebra has 19 locations\n",
      "snapshot_enonkishu has 16 locations\n",
      "snapshot_kgalagadi has 20 locations\n",
      "snapshot_kruger has 39 locations\n",
      "snapshot_karoo has 15 locations\n"
     ]
    }
   ],
   "source": [
    "for dataset, locations in dataset_to_locations.items():\n",
    "    print(f'{dataset} has {len(locations)} locations')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dataset snapshot_camdeboo\n",
      "first loc before shuffling is C05\n",
      "first loc after shuffling is C05\n",
      "dataset snapshot_mountain_zebra\n",
      "first loc before shuffling is E04\n",
      "first loc after shuffling is E03\n",
      "dataset snapshot_enonkishu\n",
      "first loc before shuffling is C02\n",
      "first loc after shuffling is D03\n",
      "dataset snapshot_kgalagadi\n",
      "first loc before shuffling is B02\n",
      "first loc after shuffling is B07\n",
      "dataset snapshot_kruger\n",
      "first loc before shuffling is 5\n",
      "first loc after shuffling is 11\n",
      "dataset snapshot_karoo\n",
      "first loc before shuffling is C02\n",
      "first loc after shuffling is C04\n"
     ]
    }
   ],
   "source": [
    "# Fixed seed so the location-to-split assignment can be reproduced after a kernel restart\n",
    "SPLIT_SEED = 0\n",
    "\n",
    "splits_table = []\n",
    "\n",
    "for dataset, locations in dataset_to_locations.items():\n",
    "    print(f'dataset {dataset}')\n",
    "    # A set has no stable iteration order between Python processes, so sort before\n",
    "    # shuffling; key=str keeps this working whether locations are strings or numbers\n",
    "    li_locations = sorted(locations, key=str)\n",
    "    print(f'first loc before shuffling is {li_locations[0]}')\n",
    "    # Seeding per dataset keeps each split independent of the order datasets are visited in\n",
    "    Random(f'{SPLIT_SEED}:{dataset}').shuffle(li_locations)\n",
    "    print(f'first loc after shuffling is {li_locations[0]}')\n",
    "\n",
    "    # 75% train and 12.5% val by number of locations; test takes whatever remains\n",
    "    num_train = round(0.75 * len(li_locations))\n",
    "    num_val = round(0.125 * len(li_locations))\n",
    "\n",
    "    locs_train = li_locations[:num_train]\n",
    "    locs_val = li_locations[num_train:num_train + num_val]\n",
    "    locs_test = li_locations[num_train + num_val:]\n",
    "\n",
    "    # every location must land in exactly one split\n",
    "    assert len(locs_train) + len(locs_val) + len(locs_test) == len(li_locations)\n",
    "\n",
    "    splits_table.append({\n",
    "        'dataset': dataset,\n",
    "        'train': locs_train,\n",
    "        'val': locs_val,\n",
    "        'test': locs_test\n",
    "    })"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {},
   "outputs": [],
   "source": [
    "split_names = ('train', 'val', 'test')\n",
    "\n",
    "# Same key order as the per-dataset entries' split lists, followed by the dataset name\n",
    "private_set_splits = {split_name: [] for split_name in split_names}\n",
    "private_set_splits['dataset'] = 'snapshot_safari_private'\n",
    "\n",
    "for public_split in splits_table:\n",
    "    public_dataset = public_split['dataset']\n",
    "    for split_name in split_names:\n",
    "        # Translate each public location into its name in the private set\n",
    "        candidate_locs = (\n",
    "            f'{dataset_name_to_code[public_dataset]}_{loc}' for loc in public_split[split_name]\n",
    "        )\n",
    "        # don't put the location in the split table if there are no data in the private set\n",
    "        private_set_splits[split_name].extend(\n",
    "            private_loc for private_loc in candidate_locs if private_loc in new_locations\n",
    "        )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {},
   "outputs": [],
   "source": [
    "splits_table.append(private_set_splits)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('.../CameraTrap/Databases/megadb_2020/snapshot_safari/snapshot_safari_splits.json', 'w') as f:\n",
    "    json.dump(splits_table, f, indent=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:cameratraps] *",
   "language": "python",
   "name": "conda-env-cameratraps-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
